# Names exported by ``from <this module> import *``.
__all__ = (
    'CorpusDataset',
)

from pathlib import Path

from common.chinese_char_dataset import ChineseCharDataset


class CorpusDataset:
    """Stream runs of in-table Chinese characters from a UTF-8 text corpus.

    The corpus at ``in_path`` is read line by line. Each character is tested
    with ``chn_char_data.is_chn_in_table``; consecutive accepted characters
    form a segment, and any rejected character ends the current segment.
    """

    def __init__(self, *, chn_char_data: 'ChineseCharDataset', in_path: Path):
        """Remember the character table and the corpus location.

        Args:
            chn_char_data: Object whose ``is_chn_in_table(c)`` method decides
                whether a single character may be part of a segment.
            in_path: Path of the corpus text file; it is opened as UTF-8.
        """
        self.in_path = in_path
        self.chn_data = chn_char_data

    def segment_generator(self):
        """Yield every segment of two or more accepted characters.

        Segments never span lines. A progress message is printed every
        1000 lines read.

        Yields:
            str: A maximal run (length >= 2) of characters accepted by
            ``is_chn_in_table`` within one line, in file order.
        """
        # ``with`` guarantees the file is closed even when the consumer stops
        # iterating early (generator close) or an exception propagates.
        with self.in_path.open('r', encoding='utf-8') as in_file:
            for line_num, in_line in enumerate(in_file, start=1):
                if line_num % 1000 == 0:
                    print(f'正在读取新闻数据集第 {line_num} 行')

                segment = []
                for c in in_line:
                    if self.chn_data.is_chn_in_table(c):
                        segment.append(c)
                        continue

                    # A rejected character terminates the current run;
                    # single-character runs are discarded.
                    if len(segment) >= 2:
                        yield ''.join(segment)
                    segment = []

                # Flush a run that reaches the end of the line without a
                # terminating character (e.g. the last line of a file that
                # has no trailing newline); otherwise it would be lost.
                if len(segment) >= 2:
                    yield ''.join(segment)
